环境设置

In [1]:
# 检查你的Python版本
from sys import version_info
if version_info.major != 2 and version_info.minor != 7:
    raise Exception('请使用Python 2.7来完成此项目')

载入数据

In [2]:
# 引入这个项目需要的库
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display # 使得我们可以对DataFrame使用display()函数
import xgboost as xgb
import time
from sklearn.cross_validation import train_test_split

# 设置以内联的形式显示matplotlib绘制的图片(在notebook中显示更美观)
%matplotlib inline

# 载入客户数据集
try:
    types={
#         'Store': np.dtype(obj),
#         'DayOfWeek': np.dtype(str),
        'Date': np.dtype(str),
        'Sales': np.dtype(float),
        'Customers': np.dtype(int),
#         'Open': np.dtype(int),
        'Promo': np.dtype(int),
#         'StateHoliday': np.dtype(str),
        'SchoolHoliday': np.dtype(int),
        'StoreType': np.dtype(str),
        'Assortment': np.dtype(str),
        'CompetitionDistance': np.dtype(float),
        'CompetitionOpenSinceYear': np.dtype(int),
        'CompetitionOpenSinceMonth': np.dtype(int),
        'Promo2': np.dtype(int),
        'Promo2SinceWeek': np.dtype(int),
        'Promo2SinceYear': np.dtype(int),
        'PromoInterval': np.dtype(str)
          }
    df_train = pd.read_csv("./train.csv",parse_dates=['Date'],date_parser=(lambda dt: pd.to_datetime(dt, format='%Y-%m-%d')),dtype=types)
    df_test = pd.read_csv("./test.csv",parse_dates=['Date'],date_parser=(lambda dt: pd.to_datetime(dt, format='%Y-%m-%d')),dtype=types)
    df_store = pd.read_csv("./store.csv")
except:
    print "Dataset could not be loaded. Is the dataset missing?"
i:\Anaconda2\envs\rossmann\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
i:\Anaconda2\envs\rossmann\lib\site-packages\IPython\core\interactiveshell.py:2714: DtypeWarning: Columns (7) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

数据处理

In [3]:
#查找缺失值
for i in df_train,df_test,df_store:
    print i.shape
    print i.isnull().sum()
    print ""
(1017209, 9)
Store            0
DayOfWeek        0
Date             0
Sales            0
Customers        0
Open             0
Promo            0
StateHoliday     0
SchoolHoliday    0
dtype: int64

(41088, 8)
Id                0
Store             0
DayOfWeek         0
Date              0
Open             11
Promo             0
StateHoliday      0
SchoolHoliday     0
dtype: int64

(1115, 10)
Store                          0
StoreType                      0
Assortment                     0
CompetitionDistance            3
CompetitionOpenSinceMonth    354
CompetitionOpenSinceYear     354
Promo2                         0
Promo2SinceWeek              544
Promo2SinceYear              544
PromoInterval                544
dtype: int64

In [4]:
df_store['Promo2SinceWeek'].astype('category').cat.categories
Out[4]:
Float64Index([ 1.0,  5.0,  6.0,  9.0, 10.0, 13.0, 14.0, 18.0, 22.0, 23.0, 26.0,
              27.0, 28.0, 31.0, 35.0, 36.0, 37.0, 39.0, 40.0, 44.0, 45.0, 48.0,
              49.0, 50.0],
             dtype='float64')
In [5]:
#对df_train df_test 设置变量,并且合并,便于查找
df_train['Train'] = 1
df_train['Test'] = 0
df_test['Train'] = 0
df_test['Test'] = 1
df_raw = pd.concat([df_train, df_test],sort=False)
In [6]:
#显示合并后数据属性
df_raw.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1058297 entries, 0 to 41087
Data columns (total 12 columns):
Store            1058297 non-null int64
DayOfWeek        1058297 non-null int64
Date             1058297 non-null datetime64[ns]
Sales            1017209 non-null float64
Customers        1017209 non-null float64
Open             1058286 non-null float64
Promo            1058297 non-null int32
StateHoliday     1058297 non-null object
SchoolHoliday    1058297 non-null int32
Train            1058297 non-null int64
Test             1058297 non-null int64
Id               41088 non-null float64
dtypes: datetime64[ns](1), float64(4), int32(2), int64(4), object(1)
memory usage: 96.9+ MB
In [7]:
#显示df_store数据属性
df_store.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1115 entries, 0 to 1114
Data columns (total 10 columns):
Store                        1115 non-null int64
StoreType                    1115 non-null object
Assortment                   1115 non-null object
CompetitionDistance          1112 non-null float64
CompetitionOpenSinceMonth    761 non-null float64
CompetitionOpenSinceYear     761 non-null float64
Promo2                       1115 non-null int64
Promo2SinceWeek              571 non-null float64
Promo2SinceYear              571 non-null float64
PromoInterval                571 non-null object
dtypes: float64(5), int64(2), object(3)
memory usage: 87.2+ KB
In [8]:
#将测试集open缺省值都记为1
df_raw.loc[df_raw.Open.isnull(), 'Open'] = 1
In [9]:
#对各项数据进行counter发现有所有异常的列
from collections import Counter
def show_me_options(data,index):
    return str(index) + ":" + str(Counter(data[index]))
In [10]:
show_me_options(df_store,'PromoInterval')
Out[10]:
"PromoInterval:Counter({nan: 544, 'Jan,Apr,Jul,Oct': 335, 'Feb,May,Aug,Nov': 130, 'Mar,Jun,Sept,Dec': 106})"
In [11]:
df_raw.Open = df_raw.Open.astype(int)
In [12]:
#将开店状态为1的,但是销售为0的剔除在训练外
df_raw.loc[((df_raw['Open'] == 1) & (df_raw['Sales'] == 0)),'Train'] = 0
In [13]:
#将stateholiday特征转化为int
df_raw['StateHoliday'] = df_raw['StateHoliday'].astype('category').cat.codes
In [14]:
show_me_options(df_raw,'StateHoliday')
Out[14]:
'StateHoliday:Counter({1: 895995, 0: 131072, 2: 20440, 3: 6690, 4: 4100})'
In [15]:
#将StoreType特征和Assortment特征转化为int
df_store['StoreType'] = df_store['StoreType'].astype('category').cat.codes
df_store['Assortment'] = df_store['Assortment'].astype('category').cat.codes
In [16]:
show_me_options(df_store,'StoreType')
Out[16]:
'StoreType:Counter({0: 602, 3: 348, 2: 148, 1: 17})'
In [17]:
show_me_options(df_store,'Assortment')
Out[17]:
'Assortment:Counter({0: 593, 2: 513, 1: 9})'
In [18]:
show_me_options(df_store,'PromoInterval')
Out[18]:
"PromoInterval:Counter({nan: 544, 'Jan,Apr,Jul,Oct': 335, 'Feb,May,Aug,Nov': 130, 'Mar,Jun,Sept,Dec': 106})"
In [19]:
df_store['PromoInterval'].astype('category').cat.categories
Out[19]:
Index([u'Feb,May,Aug,Nov', u'Jan,Apr,Jul,Oct', u'Mar,Jun,Sept,Dec'], dtype='object')
In [20]:
df_store['PromoInterval'] = df_store['PromoInterval'].astype('category').cat.codes
In [21]:
#PromoInterval处理df_store的PromoInterval特征
df_store['PromoInterval'].astype('category').cat.categories
Out[21]:
Int64Index([-1, 0, 1, 2], dtype='int64')
In [22]:
#PromoInterval处理df_store的PromoInterval特征
df_store['PromoInterval'] = df_store['PromoInterval'].astype('category').cat.rename_categories([-1,2,1,0])
In [23]:
df_store.loc[(df_store['PromoInterval']==-1),'PromoInterval']=np.nan
In [24]:
df_store.columns
Out[24]:
Index([u'Store', u'StoreType', u'Assortment', u'CompetitionDistance',
       u'CompetitionOpenSinceMonth', u'CompetitionOpenSinceYear', u'Promo2',
       u'Promo2SinceWeek', u'Promo2SinceYear', u'PromoInterval'],
      dtype='object')
In [25]:
#将competitionOpen转化为Timestamp
def CompetionOpenTrans2TS(df):
    try:
        date = '{}-{}'.format(int(df['CompetitionOpenSinceYear']), int(df['CompetitionOpenSinceMonth']))
        return time.mktime(time.strptime(date, '%Y-%m'))
    except:
        return np.nan
In [26]:
df_store['CompetitionOpenTS'] = df_store.apply(lambda df_store: CompetionOpenTrans2TS(df_store), axis=1)
In [27]:
#将Promo2转化为Timestamp
def Promo2Trans2TS(df_store):
    try:
        date = '{}-{}'.format(int(df_store['Promo2SinceYear']), int(df_store['Promo2SinceWeek']))
        return  time.mktime(time.strptime(date, '%Y-%W'))
    except:
        return np.nan
In [28]:
df_store['Promo2TS'] = df_store.apply(lambda df_store: Promo2Trans2TS(df_store), axis=1)
In [29]:
#对每个store的sales, customer 和open 求和
data_sales = df_raw.groupby([df_raw['Store']])['Sales'].sum()
data_customers = df_raw.groupby([df_raw['Store']])['Customers'].sum()
data_open = df_raw.groupby([df_raw['Store']])['Open'].count()
In [30]:
#进行计算
data_sales_per_day = data_sales / data_open
data_customers_per_day = data_customers / data_open
data_sales_per_customer_per_day = data_sales_per_day /data_customers_per_day
In [31]:
df_store = pd.merge(df_store, data_sales_per_day.reset_index(name='SalesPerDay'), how='left', on=['Store'])
df_store = pd.merge(df_store, data_customers_per_day.reset_index(name='CustomersPerDay'), how='left', on=['Store'])
df_store = pd.merge(df_store, data_sales_per_customer_per_day.reset_index(name='SalesPerCustomersPerDay'), how='left', on=['Store'])
In [32]:
df_store.columns
Out[32]:
Index([u'Store', u'StoreType', u'Assortment', u'CompetitionDistance',
       u'CompetitionOpenSinceMonth', u'CompetitionOpenSinceYear', u'Promo2',
       u'Promo2SinceWeek', u'Promo2SinceYear', u'PromoInterval',
       u'CompetitionOpenTS', u'Promo2TS', u'SalesPerDay', u'CustomersPerDay',
       u'SalesPerCustomersPerDay'],
      dtype='object')
In [33]:
features_store=['Store','StoreType','Assortment','CompetitionDistance','Promo2',\
                'PromoInterval','CompetitionOpenTS','Promo2TS','SalesPerDay',\
               'CustomersPerDay','SalesPerCustomersPerDay']

train = pd.merge(df_raw,df_store[features_store], on='Store')
# test = pd.merge(df_test,df_store, on='Store')
In [34]:
def checkpromo(df):
    try:
        if int(df['Month']%3) == int(df['PromoInterval']):
            return 1
        else:
            return 0
    except:
        return np.nan
In [35]:
#将Promo2转化为Timestamp
def dateTrans2TS(train):
    try:
        date = '{}-{}-{}'.format(int(train['Year']), int(train['Month']),int(train['Day']))
        return  time.mktime(time.strptime(date, '%Y-%m-%d'))
    except:
        return np.nan
In [36]:
features=[]
#将可以直接用的特征放入features
features.extend(['Store', 'Promo', 'SchoolHoliday', 'StateHoliday', 'StoreType', 'Assortment', 'Promo2', 'CompetitionDistance',\
                'CompetitionOpenTS','Promo2TS','SalesPerDay','CustomersPerDay','SalesPerCustomersPerDay','IsPromoMonth'])
#放入时间相关信息
features.extend(['DayOfWeek', 'Month', 'Day', 'Year','WeekOfYear','DateTS'])
train['Year'] = train.Date.dt.year
train['Month'] = train.Date.dt.month
train['Day'] = train.Date.dt.day
train['WeekOfYear'] = train.Date.dt.weekofyear
train['DateTS'] = train.apply(lambda train: dateTrans2TS(train), axis=1)
# 检查日期是否在促销月
train['IsPromoMonth']=train.apply(lambda train: checkpromo(train), axis=1)
In [37]:
print(features)
['Store', 'Promo', 'SchoolHoliday', 'StateHoliday', 'StoreType', 'Assortment', 'Promo2', 'CompetitionDistance', 'CompetitionOpenTS', 'Promo2TS', 'SalesPerDay', 'CustomersPerDay', 'SalesPerCustomersPerDay', 'IsPromoMonth', 'DayOfWeek', 'Month', 'Day', 'Year', 'WeekOfYear', 'DateTS']
In [38]:
train.columns
Out[38]:
Index([u'Store', u'DayOfWeek', u'Date', u'Sales', u'Customers', u'Open',
       u'Promo', u'StateHoliday', u'SchoolHoliday', u'Train', u'Test', u'Id',
       u'StoreType', u'Assortment', u'CompetitionDistance', u'Promo2',
       u'PromoInterval', u'CompetitionOpenTS', u'Promo2TS', u'SalesPerDay',
       u'CustomersPerDay', u'SalesPerCustomersPerDay', u'Year', u'Month',
       u'Day', u'WeekOfYear', u'DateTS', u'IsPromoMonth'],
      dtype='object')

数据可视化

In [77]:
# 销售额均值-按星期几排序
sns.factorplot(data = train, x = 'Month', y = "Sales", 
               col = 'DayOfWeek', 
               palette = 'plasma',
               hue = 'DayOfWeek') 
Out[77]:
<seaborn.axisgrid.FacetGrid at 0x6a9741d0>
In [40]:
# 销售额-按月份
sns.factorplot(data = train, x = 'StoreType', y = "Sales", 
               col = 'Month', 
               palette = 'plasma',
               hue = 'Month') 
Out[40]:
<seaborn.axisgrid.FacetGrid at 0xeb960f0>
In [41]:
# 销售额-按店类型
sns.factorplot(data = train, x = 'Month', y = "Sales", 
               col = 'StoreType', 
               palette = 'plasma',
               hue = 'StoreType') 
Out[41]:
<seaborn.axisgrid.FacetGrid at 0xeb96668>
In [42]:
# 销售额均值-对比有无促销(Promo)
sns.factorplot(data = train, x = 'Month', y = "Sales", 
               col = 'Promo', 
               palette = 'plasma',
               hue = 'Promo',
               ) 
Out[42]:
<seaborn.axisgrid.FacetGrid at 0x66646518>
In [43]:
# 销售额均值-对比StoreType在促销的情况下
sns.factorplot(data = train, x = 'Month', y = "Sales", 
               col = 'Promo', 
               palette = 'plasma',
               hue = 'StoreType') 
Out[43]:
<seaborn.axisgrid.FacetGrid at 0x3df19048>

排查异常数据

In [80]:
#手动排查异常数据
a=126
list_stores_to_check = range(a,a+5,1)
# 105,126,163,172,259,274,339,349,353,364,378,404,512,517,523,560,589,663,673,676,681,700,733,762,764,769,816,824,837,845,861,925,940,969,986
#991,1039,1068,1097,1115
plt.rcParams["figure.figsize"] = [20,len(list_stores_to_check)*5]

j = 1
for i in list_stores_to_check:
    store = i

    # Normal sales
    X1 = train.loc[(train['Store'] == store) & (train['Open'] == 1)]
    y1 = train.loc[(train['Store'] == store) & (train['Open'] == 1)]['Sales']

    Xt = train.loc[(train['Store'] == store)]
    
    plt.subplot(len(list_stores_to_check),1,j)
    plt.plot(X1['Date'], y1, '-')
    plt.minorticks_on()
    plt.grid(True, which='both')
    plt.title(i)
    j += 1
In [50]:
#手动排查异常数据
list_stores_to_check = [105,126,163,172,259,274,339,349,353,364,378,404,512,517,523,560,589,663,673,676]
# 105,126,163,172,259,274,339,349,353,364,378,404,512,517,523,560,589,663,673,676,681,700,733,762,764,769,816,824,837,845,861,925,940,969,986
#991,1039,1068,1097,1115
plt.rcParams["figure.figsize"] = [20,len(list_stores_to_check)*5]

j = 1
for i in list_stores_to_check:
    store = i

    # Normal sales
    X1 = train.loc[(train['Store'] == store) & (train['Open'] == 1)]
    y1 = train.loc[(train['Store'] == store) & (train['Open'] == 1)]['Sales']

    Xt = train.loc[(train['Store'] == store)]
    
    plt.subplot(len(list_stores_to_check),1,j)
    plt.plot(X1['DateTS'], y1, '-')
    plt.minorticks_on()
    plt.grid(True, which='both')
    plt.title(i)
    j += 1
In [51]:
#手动排查异常数据
a=1100
list_stores_to_check = [681,700,733,762,764,769,816,824,837,845,861,925,940,969,986,991,1068,1097,1115]
# 105,126,163,172,259,274,339,349,353,364,378,404,512,517,523,560,589,663,673,676,681,700,733,762,764,769,816,824,837,845,861,925,940,969,986
#991,1039,1068,1097,1115
plt.rcParams["figure.figsize"] = [20,len(list_stores_to_check)*5]

j = 1
for i in list_stores_to_check:
    store = i

    # Normal sales
    X1 = train.loc[(train['Store'] == store) & (train['Open'] == 1)]
    y1 = train.loc[(train['Store'] == store) & (train['Open'] == 1)]['Sales']

    Xt = train.loc[(train['Store'] == store)]
    
    plt.subplot(len(list_stores_to_check),1,j)
    plt.plot(X1['DateTS'], y1, '-')
    plt.minorticks_on()
    plt.grid(True, which='both')
    plt.title(i)
    j += 1
In [52]:
store_dates_to_remove = {105:1.368e09,
                         126:1.384e09,
                         163:1.366e09,
                         172:1.366e09,
                         259:1.362e09,
                         274:1.362e09,
                         339:1.362e09,
                         349:1.368e09,
                         353:1.364e09,
                         364:1.37e09,
                         378:1.388e09,
                         404:1.36e09,
                         512:1.36e09,
                         517:1.362e09,
                         523:1.382e09,
                         560:1.362e09,
                         589:1.368e09,
                         663:1.382e09,
                         673:1.364e09,
                         676:1.366e09,
                         681:1.37e09,
                         700:1.372e09,
                         733:1.362e09,
                         762:1.36e09,
                         764:1.366e09,
                         769:1.362e09,
                         816:1.372e09,
                         824:1.378e09,
                         837:1.394e09,
                         845:1.364e09,
                         861:1.364e09,
                         925:1.362e09,
                         940:1.362e09,
                         969:1.362e09,
                         986:1.366e09,
                         991:1.362e09,
                         1068:1.36e09,
                         1097:1.36e09,
                         1115:1.364e09}

for key,value in store_dates_to_remove.iteritems():
    train.loc[(train['Store'] == key) & (train['DateTS'] < value), 'Train'] = 0
In [53]:
list_stores_to_check = [105,126,163,172,259,274,339,349,353,364,378,404,512,517,523,560,589,663,673,676]

plt.rcParams["figure.figsize"] = [20,len(list_stores_to_check)*5]

j = 1
for i in list_stores_to_check:
    stor = i

    # Normal sales
    X1 = train.loc[(train['Store'] == stor) & (train['Open'] == 1) & (train['Train'] == 1)]
    y1 = train.loc[(train['Store'] == stor) & (train['Open'] == 1) & (train['Train'] == 1)]['Sales']
    
    X2 = train.loc[(train['Store'] == stor) & (train['Open'] == 1) & (train['Train'] == 0)]
    y2 = train.loc[(train['Store'] == stor) & (train['Open'] == 1) & (train['Train'] == 0)]['Sales']

    Xt = train.loc[(train['Store'] == stor)]
    
    plt.subplot(len(list_stores_to_check),1,j)
    plt.plot(X1['DateTS'], y1, '-')
    plt.plot(X2['DateTS'], y2, 'r')
    plt.minorticks_on()
    plt.grid(True, which='both')
    plt.title(i)
    j += 1
In [54]:
list_stores_to_check = [681,700,733,762,764,769,816,824,837,845,861,925,940,969,986,991,1068,1097,1115]

plt.rcParams["figure.figsize"] = [20,len(list_stores_to_check)*5]

j = 1
for i in list_stores_to_check:
    stor = i

    # Normal sales
    X1 = train.loc[(train['Store'] == stor) & (train['Open'] == 1) & (train['Train'] == 1)]
    y1 = train.loc[(train['Store'] == stor) & (train['Open'] == 1) & (train['Train'] == 1)]['Sales']
    
    X2 = train.loc[(train['Store'] == stor) & (train['Open'] == 1) & (train['Train'] == 0)]
    y2 = train.loc[(train['Store'] == stor) & (train['Open'] == 1) & (train['Train'] == 0)]['Sales']

    Xt = train.loc[(train['Store'] == stor)]
    
    plt.subplot(len(list_stores_to_check),1,j)
    plt.plot(X1['DateTS'], y1, '-')
    plt.plot(X2['DateTS'], y2, 'r')
    plt.minorticks_on()
    plt.grid(True, which='both')
    plt.title(i)
    j += 1
In [55]:
def check_outlier(points, t):
    if len(points.shape) == 1:
        points = points[:,None]
    median = np.median(points, axis=0)
    diff = np.sum((points - median)**2, axis=-1)
    diff = np.sqrt(diff)
    med_abs_deviation = np.median(diff)

    modified_z_score = 0.6745 * diff / med_abs_deviation

    return modified_z_score > t
In [56]:
train.loc[(train['Store'] == 1) & (train['Train'] == 1)]
Out[56]:
Store DayOfWeek Date Sales Customers Open Promo StateHoliday SchoolHoliday Train ... Promo2TS SalesPerDay CustomersPerDay SalesPerCustomersPerDay Year Month Day WeekOfYear DateTS IsPromoMonth
0 1 5 2015-07-31 5263.0 555.0 1 1 1 1 1 ... NaN 3754.39798 444.972727 8.437366 2015 7 31 31 1.438272e+09 NaN
1 1 4 2015-07-30 5020.0 546.0 1 1 1 1 1 ... NaN 3754.39798 444.972727 8.437366 2015 7 30 31 1.438186e+09 NaN
2 1 3 2015-07-29 4782.0 523.0 1 1 1 1 1 ... NaN 3754.39798 444.972727 8.437366 2015 7 29 31 1.438099e+09 NaN
3 1 2 2015-07-28 5011.0 560.0 1 1 1 1 1 ... NaN 3754.39798 444.972727 8.437366 2015 7 28 31 1.438013e+09 NaN
4 1 1 2015-07-27 6102.0 612.0 1 1 1 1 1 ... NaN 3754.39798 444.972727 8.437366 2015 7 27 31 1.437926e+09 NaN
5 1 7 2015-07-26 0.0 0.0 0 0 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2015 7 26 30 1.437840e+09 NaN
6 1 6 2015-07-25 4364.0 500.0 1 0 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2015 7 25 30 1.437754e+09 NaN
7 1 5 2015-07-24 3706.0 459.0 1 0 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2015 7 24 30 1.437667e+09 NaN
8 1 4 2015-07-23 3769.0 503.0 1 0 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2015 7 23 30 1.437581e+09 NaN
9 1 3 2015-07-22 3464.0 463.0 1 0 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2015 7 22 30 1.437494e+09 NaN
10 1 2 2015-07-21 3558.0 469.0 1 0 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2015 7 21 30 1.437408e+09 NaN
11 1 1 2015-07-20 4395.0 526.0 1 0 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2015 7 20 30 1.437322e+09 NaN
12 1 7 2015-07-19 0.0 0.0 0 0 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2015 7 19 29 1.437235e+09 NaN
13 1 6 2015-07-18 4406.0 512.0 1 0 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2015 7 18 29 1.437149e+09 NaN
14 1 5 2015-07-17 4852.0 519.0 1 1 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2015 7 17 29 1.437062e+09 NaN
15 1 4 2015-07-16 4427.0 517.0 1 1 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2015 7 16 29 1.436976e+09 NaN
16 1 3 2015-07-15 4767.0 550.0 1 1 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2015 7 15 29 1.436890e+09 NaN
17 1 2 2015-07-14 5042.0 544.0 1 1 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2015 7 14 29 1.436803e+09 NaN
18 1 1 2015-07-13 5054.0 553.0 1 1 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2015 7 13 29 1.436717e+09 NaN
19 1 7 2015-07-12 0.0 0.0 0 0 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2015 7 12 28 1.436630e+09 NaN
20 1 6 2015-07-11 3530.0 441.0 1 0 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2015 7 11 28 1.436544e+09 NaN
21 1 5 2015-07-10 3808.0 449.0 1 0 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2015 7 10 28 1.436458e+09 NaN
22 1 4 2015-07-09 3897.0 480.0 1 0 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2015 7 9 28 1.436371e+09 NaN
23 1 3 2015-07-08 3797.0 485.0 1 0 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2015 7 8 28 1.436285e+09 NaN
24 1 2 2015-07-07 3650.0 485.0 1 0 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2015 7 7 28 1.436198e+09 NaN
25 1 1 2015-07-06 4359.0 540.0 1 0 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2015 7 6 28 1.436112e+09 NaN
26 1 7 2015-07-05 0.0 0.0 0 0 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2015 7 5 27 1.436026e+09 NaN
27 1 6 2015-07-04 4797.0 560.0 1 0 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2015 7 4 27 1.435939e+09 NaN
28 1 5 2015-07-03 4665.0 538.0 1 1 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2015 7 3 27 1.435853e+09 NaN
29 1 4 2015-07-02 5558.0 573.0 1 1 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2015 7 2 27 1.435766e+09 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
912 1 3 2013-01-30 4601.0 560.0 1 0 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2013 1 30 5 1.359475e+09 NaN
913 1 2 2013-01-29 3725.0 522.0 1 0 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2013 1 29 5 1.359389e+09 NaN
914 1 1 2013-01-28 4055.0 549.0 1 0 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2013 1 28 5 1.359302e+09 NaN
915 1 7 2013-01-27 0.0 0.0 0 0 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2013 1 27 4 1.359216e+09 NaN
916 1 6 2013-01-26 5598.0 640.0 1 0 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2013 1 26 4 1.359130e+09 NaN
917 1 5 2013-01-25 5586.0 676.0 1 1 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2013 1 25 4 1.359043e+09 NaN
918 1 4 2013-01-24 5195.0 645.0 1 1 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2013 1 24 4 1.358957e+09 NaN
919 1 3 2013-01-23 5578.0 627.0 1 1 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2013 1 23 4 1.358870e+09 NaN
920 1 2 2013-01-22 5720.0 667.0 1 1 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2013 1 22 4 1.358784e+09 NaN
921 1 1 2013-01-21 5394.0 607.0 1 1 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2013 1 21 4 1.358698e+09 NaN
922 1 7 2013-01-20 0.0 0.0 0 0 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2013 1 20 3 1.358611e+09 NaN
923 1 6 2013-01-19 5182.0 688.0 1 0 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2013 1 19 3 1.358525e+09 NaN
924 1 5 2013-01-18 4127.0 568.0 1 0 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2013 1 18 3 1.358438e+09 NaN
925 1 4 2013-01-17 4044.0 503.0 1 0 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2013 1 17 3 1.358352e+09 NaN
926 1 3 2013-01-16 4008.0 530.0 1 0 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2013 1 16 3 1.358266e+09 NaN
927 1 2 2013-01-15 3900.0 512.0 1 0 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2013 1 15 3 1.358179e+09 NaN
928 1 1 2013-01-14 4717.0 616.0 1 0 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2013 1 14 3 1.358093e+09 NaN
929 1 7 2013-01-13 0.0 0.0 0 0 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2013 1 13 2 1.358006e+09 NaN
930 1 6 2013-01-12 4952.0 646.0 1 0 1 0 1 ... NaN 3754.39798 444.972727 8.437366 2013 1 12 2 1.357920e+09 NaN
931 1 5 2013-01-11 4881.0 592.0 1 1 1 1 1 ... NaN 3754.39798 444.972727 8.437366 2013 1 11 2 1.357834e+09 NaN
932 1 4 2013-01-10 4892.0 615.0 1 1 1 1 1 ... NaN 3754.39798 444.972727 8.437366 2013 1 10 2 1.357747e+09 NaN
933 1 3 2013-01-09 5471.0 626.0 1 1 1 1 1 ... NaN 3754.39798 444.972727 8.437366 2013 1 9 2 1.357661e+09 NaN
934 1 2 2013-01-08 5580.0 654.0 1 1 1 1 1 ... NaN 3754.39798 444.972727 8.437366 2013 1 8 2 1.357574e+09 NaN
935 1 1 2013-01-07 7176.0 785.0 1 1 1 1 1 ... NaN 3754.39798 444.972727 8.437366 2013 1 7 2 1.357488e+09 NaN
936 1 7 2013-01-06 0.0 0.0 0 0 1 1 1 ... NaN 3754.39798 444.972727 8.437366 2013 1 6 1 1.357402e+09 NaN
937 1 6 2013-01-05 4997.0 635.0 1 0 1 1 1 ... NaN 3754.39798 444.972727 8.437366 2013 1 5 1 1.357315e+09 NaN
938 1 5 2013-01-04 4486.0 619.0 1 0 1 1 1 ... NaN 3754.39798 444.972727 8.437366 2013 1 4 1 1.357229e+09 NaN
939 1 4 2013-01-03 4327.0 578.0 1 0 1 1 1 ... NaN 3754.39798 444.972727 8.437366 2013 1 3 1 1.357142e+09 NaN
940 1 3 2013-01-02 5530.0 668.0 1 0 1 1 1 ... NaN 3754.39798 444.972727 8.437366 2013 1 2 1 1.357056e+09 NaN
941 1 2 2013-01-01 0.0 0.0 0 0 2 1 1 ... NaN 3754.39798 444.972727 8.437366 2013 1 1 1 1.356970e+09 NaN

942 rows × 28 columns

In [57]:
for i in train['Store'].unique():
    train.loc[(train['Store'] == i) & (train['Test'] == 0) & (train['Open'] == 1), 'Outlier'] = \
        check_outlier(train.loc[(train['Store'] == i) & (train['Test'] == 0)& (train['Open'] == 1)]['Sales'], 3.5)
In [58]:
train.loc[(train['Test'] == 0)&(train['Outlier']==True)].shape
Out[58]:
(6425, 29)
In [59]:
b = 1

plt.rcParams["figure.figsize"] = [20,10*5]

for i in range(b,b+25,1):
    stor = i

    # Normal sales
    X1 = train.loc[(train['Store'] == stor) & (train['Open'] == 1) & (train['Outlier'] == False)]
    y1 = train.loc[(train['Store'] == stor) & (train['Open'] == 1) & (train['Outlier'] == False)]['Sales']

    # Outliers
    X2 = train.loc[(train['Store'] == stor) & (train['Open'] == 1) & (train['Outlier'] == True)]
    y2 = train.loc[(train['Store'] == stor) & (train['Open'] == 1) & (train['Outlier'] == True)]['Sales']

    Xt = train.loc[(train['Store'] == stor)]
    
    plt.subplot(10,5,i)
    plt.plot(X1['Date'], y1, '-')
    plt.plot(X2['Date'], y2, 'r.')
    plt.title(i)
    plt.axis('on')
In [60]:
def rmspe(preds,y):
    return np.sqrt(np.mean(((y - preds)/y) ** 2))

def rmspe_exp(preds,y):
    y = np.expm1(y.get_label())
    preds = np.expm1(preds)
    return "rmspe", rmspe(preds,y)
In [61]:
params = {"objective": "gpu:reg:linear",
          "booster" : "gbtree",
          "eta": 0.01,
          "max_depth": 12,
          "subsample": 0.7,
          "colsample_bytree": 0.5,
#           "min_child_weight": 1,
          "silent": 1,
          "seed": 42,
          'nthread':6,
#           "tree_method":"gpu_hist"
          }
num_boost_round = 20000

print("训练准备完成")
# X_train = train.loc[(train['Outlier'] == False) & (train['Delete'] != False) ]
# X_valid = valid.loc[(valid['Outlier'] == False) & (valid['Delete'] != False) ]
# y_train = np.log(X_train.Sales)
# y_valid = np.log(X_valid.Sales)
X_train, X_valid, y_train, y_valid = train_test_split(train.loc[(train['Train'] == 1) & (train['Open'] == 1) & (train['Outlier'] == False)][features],
                                                    np.log1p(train.loc[(train['Train'] == 1) & (train['Open'] == 1) & (train['Outlier'] == False)].Sales),
                                                    test_size=0.1)

dtrain = xgb.DMatrix(X_train[features], y_train)
dvalid = xgb.DMatrix(X_valid[features], y_valid)

evallist = [(dtrain, 'train'), (dvalid, 'eval')]
evals_results={}
训练准备完成
In [65]:
print features
['Store', 'Promo', 'SchoolHoliday', 'StateHoliday', 'StoreType', 'Assortment', 'Promo2', 'CompetitionDistance', 'CompetitionOpenTS', 'Promo2TS', 'SalesPerDay', 'CustomersPerDay', 'SalesPerCustomersPerDay', 'IsPromoMonth', 'DayOfWeek', 'Month', 'Day', 'Year', 'WeekOfYear', 'DateTS']
In [64]:
gbm = xgb.train(params, dtrain, num_boost_round, evals=evallist,evals_result=evals_results, early_stopping_rounds=200, \
  feval=rmspe_exp, verbose_eval=50)
[0]	train-rmse:8.18189	eval-rmse:8.1805	train-rmspe:0.999863	eval-rmspe:0.999864
Multiple eval metrics have been passed: 'eval-rmspe' will be used for early stopping.

Will train until eval-rmspe hasn't improved in 200 rounds.
[50]	train-rmse:4.95464	eval-rmse:4.95344	train-rmspe:0.992651	eval-rmspe:0.992644
[100]	train-rmse:3.00251	eval-rmse:3.00149	train-rmspe:0.948337	eval-rmspe:0.948273
[150]	train-rmse:1.82252	eval-rmse:1.82161	train-rmspe:0.833834	eval-rmspe:0.833454
[200]	train-rmse:1.11023	eval-rmse:1.10949	train-rmspe:0.664617	eval-rmspe:0.663186
[250]	train-rmse:0.681924	eval-rmse:0.681408	train-rmspe:0.489628	eval-rmspe:0.486137
[300]	train-rmse:0.426359	eval-rmse:0.42615	train-rmspe:0.346969	eval-rmspe:0.339975
[350]	train-rmse:0.276605	eval-rmse:0.276947	train-rmspe:0.248592	eval-rmspe:0.23759
[400]	train-rmse:0.192334	eval-rmse:0.193498	train-rmspe:0.190733	eval-rmspe:0.175852
[450]	train-rmse:0.14737	eval-rmse:0.149454	train-rmspe:0.161109	eval-rmspe:0.143473
[500]	train-rmse:0.124648	eval-rmse:0.127671	train-rmspe:0.147036	eval-rmspe:0.128591
[550]	train-rmse:0.112932	eval-rmse:0.116688	train-rmspe:0.138116	eval-rmspe:0.121677
[600]	train-rmse:0.106357	eval-rmse:0.110755	train-rmspe:0.13224	eval-rmspe:0.118063
[650]	train-rmse:0.102083	eval-rmse:0.107001	train-rmspe:0.127035	eval-rmspe:0.115598
[700]	train-rmse:0.099136	eval-rmse:0.104529	train-rmspe:0.123007	eval-rmspe:0.113876
[750]	train-rmse:0.096697	eval-rmse:0.102542	train-rmspe:0.120965	eval-rmspe:0.112238
[800]	train-rmse:0.094549	eval-rmse:0.100833	train-rmspe:0.117067	eval-rmspe:0.11067
[850]	train-rmse:0.092954	eval-rmse:0.099632	train-rmspe:0.114359	eval-rmspe:0.10955
[900]	train-rmse:0.091503	eval-rmse:0.098528	train-rmspe:0.112649	eval-rmspe:0.108471
[950]	train-rmse:0.09025	eval-rmse:0.097619	train-rmspe:0.110134	eval-rmspe:0.10755
[1000]	train-rmse:0.08909	eval-rmse:0.096757	train-rmspe:0.108973	eval-rmspe:0.106681
[1050]	train-rmse:0.088165	eval-rmse:0.096119	train-rmspe:0.107369	eval-rmspe:0.106003
[1100]	train-rmse:0.08719	eval-rmse:0.095445	train-rmspe:0.104849	eval-rmspe:0.105288
[1150]	train-rmse:0.086253	eval-rmse:0.094801	train-rmspe:0.103883	eval-rmspe:0.104618
[1200]	train-rmse:0.085312	eval-rmse:0.094136	train-rmspe:0.102206	eval-rmspe:0.103903
[1250]	train-rmse:0.084496	eval-rmse:0.093568	train-rmspe:0.101288	eval-rmspe:0.103288
[1300]	train-rmse:0.083711	eval-rmse:0.093054	train-rmspe:0.100038	eval-rmspe:0.102756
[1350]	train-rmse:0.08297	eval-rmse:0.09257	train-rmspe:0.099045	eval-rmspe:0.102196
[1400]	train-rmse:0.082171	eval-rmse:0.092053	train-rmspe:0.09764	eval-rmspe:0.101664
[1450]	train-rmse:0.081487	eval-rmse:0.091618	train-rmspe:0.096707	eval-rmspe:0.101196
[1500]	train-rmse:0.080799	eval-rmse:0.091188	train-rmspe:0.095632	eval-rmspe:0.100715
[1550]	train-rmse:0.08015	eval-rmse:0.090795	train-rmspe:0.094558	eval-rmspe:0.100311
[1600]	train-rmse:0.079519	eval-rmse:0.09037	train-rmspe:0.093522	eval-rmspe:0.099876
[1650]	train-rmse:0.078917	eval-rmse:0.090029	train-rmspe:0.092419	eval-rmspe:0.099486
[1700]	train-rmse:0.078332	eval-rmse:0.08967	train-rmspe:0.091303	eval-rmspe:0.099079
[1750]	train-rmse:0.077784	eval-rmse:0.089387	train-rmspe:0.09058	eval-rmspe:0.098783
[1800]	train-rmse:0.077209	eval-rmse:0.089046	train-rmspe:0.089397	eval-rmspe:0.098425
[1850]	train-rmse:0.076746	eval-rmse:0.088819	train-rmspe:0.088452	eval-rmspe:0.098181
[1900]	train-rmse:0.076231	eval-rmse:0.08853	train-rmspe:0.087687	eval-rmspe:0.097875
[1950]	train-rmse:0.075748	eval-rmse:0.088258	train-rmspe:0.086984	eval-rmspe:0.097623
[2000]	train-rmse:0.075294	eval-rmse:0.088033	train-rmspe:0.086067	eval-rmspe:0.09737
[2050]	train-rmse:0.074825	eval-rmse:0.087772	train-rmspe:0.085282	eval-rmspe:0.097111
[2100]	train-rmse:0.074348	eval-rmse:0.087543	train-rmspe:0.084747	eval-rmspe:0.096855
[2150]	train-rmse:0.073889	eval-rmse:0.087295	train-rmspe:0.084044	eval-rmspe:0.096596
[2200]	train-rmse:0.073457	eval-rmse:0.087079	train-rmspe:0.083204	eval-rmspe:0.096386
[2250]	train-rmse:0.073062	eval-rmse:0.086891	train-rmspe:0.082479	eval-rmspe:0.096193
[2300]	train-rmse:0.072618	eval-rmse:0.086685	train-rmspe:0.081987	eval-rmspe:0.095993
[2350]	train-rmse:0.07226	eval-rmse:0.086513	train-rmspe:0.081413	eval-rmspe:0.095802
[2400]	train-rmse:0.071875	eval-rmse:0.086347	train-rmspe:0.080651	eval-rmspe:0.095627
[2450]	train-rmse:0.07149	eval-rmse:0.086165	train-rmspe:0.080126	eval-rmspe:0.095437
[2500]	train-rmse:0.071122	eval-rmse:0.08601	train-rmspe:0.079324	eval-rmspe:0.095277
[2550]	train-rmse:0.070784	eval-rmse:0.085867	train-rmspe:0.078459	eval-rmspe:0.095143
[2600]	train-rmse:0.070452	eval-rmse:0.085752	train-rmspe:0.07784	eval-rmspe:0.095025
[2650]	train-rmse:0.070088	eval-rmse:0.085596	train-rmspe:0.077249	eval-rmspe:0.094865
[2700]	train-rmse:0.069718	eval-rmse:0.085428	train-rmspe:0.076746	eval-rmspe:0.094716
[2750]	train-rmse:0.069372	eval-rmse:0.085287	train-rmspe:0.076142	eval-rmspe:0.094549
[2800]	train-rmse:0.069015	eval-rmse:0.085153	train-rmspe:0.07567	eval-rmspe:0.094442
[2850]	train-rmse:0.068627	eval-rmse:0.084986	train-rmspe:0.075045	eval-rmspe:0.094274
[2900]	train-rmse:0.068318	eval-rmse:0.084872	train-rmspe:0.074422	eval-rmspe:0.094151
[2950]	train-rmse:0.068016	eval-rmse:0.084755	train-rmspe:0.073895	eval-rmspe:0.094017
[3000]	train-rmse:0.067726	eval-rmse:0.084635	train-rmspe:0.073424	eval-rmspe:0.093888
[3050]	train-rmse:0.067425	eval-rmse:0.084524	train-rmspe:0.073059	eval-rmspe:0.093788
[3100]	train-rmse:0.067129	eval-rmse:0.084431	train-rmspe:0.072495	eval-rmspe:0.093697
[3150]	train-rmse:0.066814	eval-rmse:0.084325	train-rmspe:0.071793	eval-rmspe:0.093586
[3200]	train-rmse:0.066546	eval-rmse:0.084223	train-rmspe:0.071462	eval-rmspe:0.093483
[3250]	train-rmse:0.066243	eval-rmse:0.084119	train-rmspe:0.071128	eval-rmspe:0.093382
[3300]	train-rmse:0.065964	eval-rmse:0.08403	train-rmspe:0.070587	eval-rmspe:0.093288
[3350]	train-rmse:0.065685	eval-rmse:0.083952	train-rmspe:0.070182	eval-rmspe:0.093216
[3400]	train-rmse:0.065422	eval-rmse:0.083871	train-rmspe:0.06988	eval-rmspe:0.093143
[3450]	train-rmse:0.065159	eval-rmse:0.083798	train-rmspe:0.069479	eval-rmspe:0.093068
[3500]	train-rmse:0.064887	eval-rmse:0.083713	train-rmspe:0.069162	eval-rmspe:0.09298
[3550]	train-rmse:0.064631	eval-rmse:0.083641	train-rmspe:0.068831	eval-rmspe:0.092905
[3600]	train-rmse:0.064363	eval-rmse:0.083569	train-rmspe:0.068318	eval-rmspe:0.092827
[3650]	train-rmse:0.064106	eval-rmse:0.083504	train-rmspe:0.067949	eval-rmspe:0.092768
[3700]	train-rmse:0.063861	eval-rmse:0.083438	train-rmspe:0.067607	eval-rmspe:0.092706
[3750]	train-rmse:0.063577	eval-rmse:0.083359	train-rmspe:0.067229	eval-rmspe:0.09263
[3800]	train-rmse:0.063313	eval-rmse:0.08329	train-rmspe:0.06688	eval-rmspe:0.092559
[3850]	train-rmse:0.063063	eval-rmse:0.083223	train-rmspe:0.066526	eval-rmspe:0.092497
[3900]	train-rmse:0.062797	eval-rmse:0.083149	train-rmspe:0.066029	eval-rmspe:0.092424
[3950]	train-rmse:0.06255	eval-rmse:0.083087	train-rmspe:0.06571	eval-rmspe:0.092355
[4000]	train-rmse:0.062291	eval-rmse:0.083033	train-rmspe:0.065365	eval-rmspe:0.092304
[4050]	train-rmse:0.062074	eval-rmse:0.082982	train-rmspe:0.065128	eval-rmspe:0.092256
[4100]	train-rmse:0.061843	eval-rmse:0.082936	train-rmspe:0.064803	eval-rmspe:0.092212
[4150]	train-rmse:0.061596	eval-rmse:0.08288	train-rmspe:0.064523	eval-rmspe:0.092157
[4200]	train-rmse:0.061359	eval-rmse:0.082821	train-rmspe:0.064201	eval-rmspe:0.092097
[4250]	train-rmse:0.061124	eval-rmse:0.082764	train-rmspe:0.063939	eval-rmspe:0.092043
[4300]	train-rmse:0.060903	eval-rmse:0.082717	train-rmspe:0.063654	eval-rmspe:0.091997
[4350]	train-rmse:0.060658	eval-rmse:0.082663	train-rmspe:0.063345	eval-rmspe:0.091932
[4400]	train-rmse:0.060423	eval-rmse:0.082615	train-rmspe:0.063001	eval-rmspe:0.091895
[4450]	train-rmse:0.060199	eval-rmse:0.082564	train-rmspe:0.062703	eval-rmspe:0.091834
[4500]	train-rmse:0.059988	eval-rmse:0.08252	train-rmspe:0.062426	eval-rmspe:0.091792
[4550]	train-rmse:0.059785	eval-rmse:0.082476	train-rmspe:0.062181	eval-rmspe:0.091755
[4600]	train-rmse:0.059586	eval-rmse:0.082435	train-rmspe:0.061957	eval-rmspe:0.091716
[4650]	train-rmse:0.059369	eval-rmse:0.082397	train-rmspe:0.061699	eval-rmspe:0.091684
[4700]	train-rmse:0.059159	eval-rmse:0.082357	train-rmspe:0.061452	eval-rmspe:0.091647
[4750]	train-rmse:0.058971	eval-rmse:0.082327	train-rmspe:0.06121	eval-rmspe:0.091612
[4800]	train-rmse:0.058768	eval-rmse:0.082285	train-rmspe:0.060954	eval-rmspe:0.091573
[4850]	train-rmse:0.058569	eval-rmse:0.08225	train-rmspe:0.060694	eval-rmspe:0.091549
[4900]	train-rmse:0.058374	eval-rmse:0.082221	train-rmspe:0.060467	eval-rmspe:0.091519
[4950]	train-rmse:0.05818	eval-rmse:0.082189	train-rmspe:0.060217	eval-rmspe:0.091489
[5000]	train-rmse:0.057987	eval-rmse:0.082157	train-rmspe:0.059979	eval-rmspe:0.091456
[5050]	train-rmse:0.057804	eval-rmse:0.082131	train-rmspe:0.059751	eval-rmspe:0.091427
[5100]	train-rmse:0.057602	eval-rmse:0.082098	train-rmspe:0.059497	eval-rmspe:0.091398
[5150]	train-rmse:0.057417	eval-rmse:0.082063	train-rmspe:0.059276	eval-rmspe:0.091366
[5200]	train-rmse:0.057225	eval-rmse:0.082027	train-rmspe:0.05905	eval-rmspe:0.091334
[5250]	train-rmse:0.057041	eval-rmse:0.082002	train-rmspe:0.058833	eval-rmspe:0.091308
[5300]	train-rmse:0.056838	eval-rmse:0.081965	train-rmspe:0.058592	eval-rmspe:0.091267
[5350]	train-rmse:0.056644	eval-rmse:0.081938	train-rmspe:0.058356	eval-rmspe:0.091244
[5400]	train-rmse:0.05646	eval-rmse:0.081912	train-rmspe:0.058148	eval-rmspe:0.091219
[5450]	train-rmse:0.05627	eval-rmse:0.081886	train-rmspe:0.057925	eval-rmspe:0.091199
[5500]	train-rmse:0.056081	eval-rmse:0.081862	train-rmspe:0.057714	eval-rmspe:0.091162
[5550]	train-rmse:0.055894	eval-rmse:0.081841	train-rmspe:0.057508	eval-rmspe:0.091148
[5600]	train-rmse:0.055719	eval-rmse:0.081818	train-rmspe:0.057309	eval-rmspe:0.091119
[5650]	train-rmse:0.055542	eval-rmse:0.081795	train-rmspe:0.057102	eval-rmspe:0.091088
[5700]	train-rmse:0.055364	eval-rmse:0.081773	train-rmspe:0.056899	eval-rmspe:0.091068
[5750]	train-rmse:0.055176	eval-rmse:0.081755	train-rmspe:0.05667	eval-rmspe:0.091055
[5800]	train-rmse:0.05499	eval-rmse:0.081736	train-rmspe:0.056455	eval-rmspe:0.091047
[5850]	train-rmse:0.054823	eval-rmse:0.081714	train-rmspe:0.056262	eval-rmspe:0.091027
[5900]	train-rmse:0.054652	eval-rmse:0.081697	train-rmspe:0.056068	eval-rmspe:0.091012
[5950]	train-rmse:0.054486	eval-rmse:0.081684	train-rmspe:0.055881	eval-rmspe:0.091004
[6000]	train-rmse:0.054336	eval-rmse:0.081672	train-rmspe:0.055705	eval-rmspe:0.091
[6050]	train-rmse:0.054163	eval-rmse:0.081654	train-rmspe:0.055508	eval-rmspe:0.090987
[6100]	train-rmse:0.054006	eval-rmse:0.081639	train-rmspe:0.055333	eval-rmspe:0.090968
[6150]	train-rmse:0.053842	eval-rmse:0.081621	train-rmspe:0.055144	eval-rmspe:0.090949
[6200]	train-rmse:0.053695	eval-rmse:0.081603	train-rmspe:0.054981	eval-rmspe:0.090924
[6250]	train-rmse:0.053528	eval-rmse:0.081585	train-rmspe:0.054795	eval-rmspe:0.090898
[6300]	train-rmse:0.05337	eval-rmse:0.081567	train-rmspe:0.054619	eval-rmspe:0.090887
[6350]	train-rmse:0.05321	eval-rmse:0.081545	train-rmspe:0.054437	eval-rmspe:0.090861
[6400]	train-rmse:0.053052	eval-rmse:0.081534	train-rmspe:0.054264	eval-rmspe:0.090856
[6450]	train-rmse:0.052901	eval-rmse:0.081521	train-rmspe:0.054099	eval-rmspe:0.090844
[6500]	train-rmse:0.052744	eval-rmse:0.081505	train-rmspe:0.053926	eval-rmspe:0.090833
[6550]	train-rmse:0.052595	eval-rmse:0.081493	train-rmspe:0.053759	eval-rmspe:0.090816
[6600]	train-rmse:0.052441	eval-rmse:0.081476	train-rmspe:0.053587	eval-rmspe:0.090789
[6650]	train-rmse:0.052282	eval-rmse:0.081466	train-rmspe:0.053412	eval-rmspe:0.090787
[6700]	train-rmse:0.052124	eval-rmse:0.081458	train-rmspe:0.053239	eval-rmspe:0.090769
[6750]	train-rmse:0.051976	eval-rmse:0.081449	train-rmspe:0.053076	eval-rmspe:0.090757
[6800]	train-rmse:0.051812	eval-rmse:0.081438	train-rmspe:0.052896	eval-rmspe:0.090745
[6850]	train-rmse:0.051671	eval-rmse:0.081429	train-rmspe:0.052738	eval-rmspe:0.090746
[6900]	train-rmse:0.051517	eval-rmse:0.081418	train-rmspe:0.05256	eval-rmspe:0.090734
[6950]	train-rmse:0.051363	eval-rmse:0.081407	train-rmspe:0.052394	eval-rmspe:0.090719
[7000]	train-rmse:0.051211	eval-rmse:0.081399	train-rmspe:0.052226	eval-rmspe:0.090712
[7050]	train-rmse:0.051055	eval-rmse:0.081392	train-rmspe:0.052054	eval-rmspe:0.090705
[7100]	train-rmse:0.050911	eval-rmse:0.081387	train-rmspe:0.051899	eval-rmspe:0.090699
[7150]	train-rmse:0.050762	eval-rmse:0.081377	train-rmspe:0.051737	eval-rmspe:0.0907
[7200]	train-rmse:0.050613	eval-rmse:0.081372	train-rmspe:0.051574	eval-rmspe:0.090688
[7250]	train-rmse:0.050461	eval-rmse:0.081362	train-rmspe:0.05141	eval-rmspe:0.090676
[7300]	train-rmse:0.050307	eval-rmse:0.081356	train-rmspe:0.051243	eval-rmspe:0.090672
[7350]	train-rmse:0.050161	eval-rmse:0.081348	train-rmspe:0.051084	eval-rmspe:0.090668
[7400]	train-rmse:0.050029	eval-rmse:0.081344	train-rmspe:0.050943	eval-rmspe:0.090666
[7450]	train-rmse:0.049889	eval-rmse:0.081337	train-rmspe:0.05079	eval-rmspe:0.090661
[7500]	train-rmse:0.049746	eval-rmse:0.081332	train-rmspe:0.050631	eval-rmspe:0.090658
[7550]	train-rmse:0.049607	eval-rmse:0.081327	train-rmspe:0.050479	eval-rmspe:0.090654
[7600]	train-rmse:0.049475	eval-rmse:0.081321	train-rmspe:0.050337	eval-rmspe:0.090644
[7650]	train-rmse:0.049333	eval-rmse:0.081319	train-rmspe:0.050183	eval-rmspe:0.090648
[7700]	train-rmse:0.049206	eval-rmse:0.081308	train-rmspe:0.050047	eval-rmspe:0.090624
[7750]	train-rmse:0.04908	eval-rmse:0.081301	train-rmspe:0.049911	eval-rmspe:0.09061
[7800]	train-rmse:0.048953	eval-rmse:0.081298	train-rmspe:0.049774	eval-rmspe:0.090609
[7850]	train-rmse:0.048828	eval-rmse:0.081295	train-rmspe:0.049636	eval-rmspe:0.090602
[7900]	train-rmse:0.048707	eval-rmse:0.081291	train-rmspe:0.049506	eval-rmspe:0.090598
[7950]	train-rmse:0.048569	eval-rmse:0.081285	train-rmspe:0.049361	eval-rmspe:0.090593
[8000]	train-rmse:0.04844	eval-rmse:0.081283	train-rmspe:0.049222	eval-rmspe:0.090593
[8050]	train-rmse:0.048316	eval-rmse:0.081277	train-rmspe:0.049089	eval-rmspe:0.090594
[8100]	train-rmse:0.048186	eval-rmse:0.081274	train-rmspe:0.048948	eval-rmspe:0.09059
[8150]	train-rmse:0.048059	eval-rmse:0.081271	train-rmspe:0.04881	eval-rmspe:0.090583
[8200]	train-rmse:0.047933	eval-rmse:0.081266	train-rmspe:0.048676	eval-rmspe:0.090576
[8250]	train-rmse:0.047793	eval-rmse:0.081268	train-rmspe:0.048523	eval-rmspe:0.090583
[8300]	train-rmse:0.047661	eval-rmse:0.081266	train-rmspe:0.048382	eval-rmspe:0.090576
[8350]	train-rmse:0.047531	eval-rmse:0.081262	train-rmspe:0.048242	eval-rmspe:0.090571
[8400]	train-rmse:0.047412	eval-rmse:0.081256	train-rmspe:0.048113	eval-rmspe:0.090564
[8450]	train-rmse:0.047281	eval-rmse:0.081255	train-rmspe:0.047972	eval-rmspe:0.090568
[8500]	train-rmse:0.047144	eval-rmse:0.081252	train-rmspe:0.047826	eval-rmspe:0.090565
[8550]	train-rmse:0.047014	eval-rmse:0.081253	train-rmspe:0.047685	eval-rmspe:0.090565
[8600]	train-rmse:0.046884	eval-rmse:0.08125	train-rmspe:0.047547	eval-rmspe:0.090558
[8650]	train-rmse:0.046766	eval-rmse:0.081248	train-rmspe:0.047422	eval-rmspe:0.090552
[8700]	train-rmse:0.046646	eval-rmse:0.081245	train-rmspe:0.047296	eval-rmspe:0.09055
[8750]	train-rmse:0.046529	eval-rmse:0.081243	train-rmspe:0.047171	eval-rmspe:0.090552
[8800]	train-rmse:0.046406	eval-rmse:0.081245	train-rmspe:0.047041	eval-rmspe:0.090558
[8850]	train-rmse:0.046283	eval-rmse:0.081244	train-rmspe:0.046907	eval-rmspe:0.090558
[8900]	train-rmse:0.046162	eval-rmse:0.081243	train-rmspe:0.04678	eval-rmspe:0.090555
Stopping. Best iteration:
[8724]	train-rmse:0.046584	eval-rmse:0.081243	train-rmspe:0.047229	eval-rmspe:0.090546

In [67]:
import winsound 
winsound.Beep(400, 1000)
In [72]:
plt.rcParams["figure.figsize"] = [10,5]
# 训练过程可视化
plt.plot(evals_results['train']['rmspe'][400:])
plt.plot(evals_results['eval']['rmspe'][400:])
plt.title('model accuracy')
plt.ylabel('rmspe_value')
plt.xlabel('num_boost_round')
plt.legend(['train', 'eval'], loc='upper left')
plt.savefig("model1.png")  
In [84]:
print "生成测试集"
dtest = xgb.DMatrix(train.loc[(train['Test'] == 1)][features])
test_probs = gbm.predict(dtest)
result = pd.DataFrame({"Id": train.loc[(train['Test'] == 1)]['Id'].astype('int'), 'Sales': np.expm1(test_probs)*0.965})
result.to_csv("xgboost_submission10.csv", index=False)
生成测试集
In [73]:
gbm.save_model('final.model')
In [74]:
plt.rcParams["figure.figsize"] = [15,15]
In [75]:
xgb.plot_importance(gbm,ax=None, height=0.2, xlim=None, ylim=None, title='Feature importance', xlabel='F score', ylabel='Features', importance_type='weight', max_num_features=None, grid=True, show_values=False)
Out[75]:
<matplotlib.axes._subplots.AxesSubplot at 0x15acc2e8>
In [81]:
np.exp(np.log(5000))
Out[81]:
5000.0000000000036
In [82]:
np.expm1(np.log1p(5000))
Out[82]:
5000.0000000000027